|
DS project: Customer satisfaction - Topic Modeling - BERTopic |
Date : 2023
Author : Elie MAZE, Luc Thomas
Goal : Trouver les topics à partir d'avis de clients et estimer comment ils sont liés à des sujets de mécontentements ou de satisfaction.
import _mypath
import os
import pandas as pd
import numpy as np
from threadpoolctl import threadpool_limits
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import manifold
from random import choice
from pprint import pprint
import nltk
nltk.data.path.append("/data/DATALAB_PAU/18_HPC/nltk_data")
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from threadpoolctl import threadpool_limits
import torch
from transformers import FlaubertModel, FlaubertTokenizer, CamembertTokenizer, CamembertModel
from training.predict import encode_inputs, getEmbeddings, getTextsEmbeddings
from data.processing import plot_word_cloud
from visu.visu import plotVectors
import gc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 1
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
# Star-rating labels "1".."5" for plot legends, with one marker shape per rating.
SETS = [str(i) for i in range(1,6)]
MARKERS = ['square', 'circle', 'asterisk', 'triangle', 'diamond']
# French stopword list (NLTK data path was appended above) used for word clouds.
stop_words = stopwords.words('french')
# Restrict CUDA to physical GPU 15 and point at local CUDA 11.2 / cuDNN 8.1 libs.
# NOTE(review): with CUDA_VISIBLE_DEVICES="15" the single visible device is
# normally remapped to cuda:0, yet "cuda:15" is used below and apparently works —
# the env var is likely set too late (after CUDA init) to take effect; confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="15"
os.environ["LD_LIBRARY_PATH"]="/data/appli_PITSI/HGX2//nvidia/cuda/install/cuda_11.2.1/lib64/:/data/appli_PITSI/HGX2/nvidia/cuda/install/cuda_11.2.1_compat/lib64:/data/appli_PITSI/HGX2/nvidia/cudnn/install/cudnn-11.2-linux-x64-v8.1.1.33/lib64"
# selecting a GPU with PyTorch (choose a free GPU!)
GPU_NUMBER = 15
# Setting device on GPU if available, else CPU; all models below are moved
# to this device with `.to(device)`.
device = torch.device("cuda:{0}".format(GPU_NUMBER) if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    # Debug report: device identity, cuDNN version, and memory headroom.
    print("__Used Logical Devices: {0}".format(device))
    print("__CUDNN VERSION:", torch.backends.cudnn.version())
    print("__Device Name:", torch.cuda.get_device_name(GPU_NUMBER))
    print("__Device Total Memory: {} GB".format(round(torch.cuda.get_device_properties(device).total_memory/1024**3, 1)))
    # Fixed: was `torch.torch.cuda.memory_reserved` — `torch.torch` is a
    # redundant self-alias; the documented API is `torch.cuda.memory_reserved`.
    print("__Memory Usage: Allocated {0} GB, Cached {1} GB".format(
        round(torch.cuda.memory_allocated(device)/1024**3, 1),
        round(torch.cuda.memory_reserved(device)/1024**3, 1)))
__Used Logical Devices: cuda:15 __CUDNN VERSION: 8101 __Device Name: Tesla V100-SXM3-32GB __Device Total Memory: 31.7 GB __Memory Usage: Allocated 0.0 GB, Cached 0.5 GB
# Project layout: processed inputs and locally-stored model checkpoints.
PRJ_FOLDER = "/data/DATALAB_PAU/20_projects/j0215602/DS_NLP"
#PRJ_FOLDER = r"D:\DevPy\DS_NLP"
DATA_FOLDER = os.path.join(PRJ_FOLDER, "input", "processed")
RAW_MODEL_FOLDER = os.path.join(PRJ_FOLDER, "models")
# Thread cap applied via threadpool_limits around each BERTopic fit below.
_MAX_WORKERS = 8
# Load the cleaned review dataset (one row per customer review; includes raw
# text, cleaned/lemmatized variants, star rating and sentiment label).
infile = os.path.join(DATA_FOLDER, "processed_dataset.csv")
df = pd.read_csv(infile, sep=",", encoding="utf-8", engine="python")
print("data set:", df.shape)
data set: (105511, 11)
# Quick look at the first rows of the loaded dataset.
df.head(5)
| Commentaire | star | date | client | reponse | source | company | langage | cleaned_words | cleaned_lemma | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Colis commandé mardi 28 fevrier 23 , livraiso... | 1 | 2023-03-07 | Toto | NaN | TrustPilot | Amazon | fr | colis commandé mardi fevrier livraison jeu... | colis commander mardi fevrier livraison je... | __label__NEGATIVE |
| 1 | Amazon avec sa politique de retour est la pire... | 1 | 2023-03-07 | nasri eddine | NaN | TrustPilot | Amazon | fr | amazon politique pire contrairement ... | amazon politique pire contrairement ... | __label__NEGATIVE |
| 2 | Dieu sait que j'en connais des déboires avec l... | 4 | 2023-03-07 | Amandine | NaN | TrustPilot | Amazon | fr | dieu connais déboires sites marchands ... | dieu connaître déboire site marchand a... | __label__POSITIVE |
| 3 | Nul, preleve une commande que je n'ai jamais r... | 1 | 2023-03-07 | Bob Brico | NaN | TrustPilot | Amazon | fr | preleve commande jamais reçu service cl... | prelev commande jamais recevoir service... | __label__NEGATIVE |
| 4 | Colis soit disant livré mais jamais reçu donc ... | 1 | 2023-03-06 | Client | NaN | TrustPilot | Amazon | fr | colis disant livré jamais reçu perdu non re... | colis dire livrer jamais recevoir perdre no... | __label__NEGATIVE |
# Distribution of raw comment lengths (log y-scale to expose the long tail).
df["Commentaire"].str.len().hist(bins=50, log=True)
<AxesSubplot:>
# Keep only comments whose stripped length exceeds `threshold` characters;
# `mask_comments` is reused below to filter the sampling frame.
threshold = 40
stripped_lengths = df["Commentaire"].str.strip().str.len()
mask_comments = stripped_lengths > threshold
kept_ratio = 100 * mask_comments.sum() / df.shape[0]
print(f"ratio of comments longer than {threshold:d} characters: {kept_ratio:.2f}")
ratio of comments longer than 40 characters: 80.49
# Draw a balanced sample of SAMPLE_NB reviews per star rating (1..5), tagging
# each with a coarse satisfaction label; result is `samples`, used throughout.
SAMPLE_NB = 2000
print("samples per star:", SAMPLE_NB)
tmp = pd.DataFrame(df[mask_comments])

def _satisfaction_label(star):
    # 1-2 -> negative, 3 -> neutral, 4-5 -> positive
    if star == 3:
        return "neutral"
    return "positive" if star > 3 else "negative"

per_star = []
for star in range(1, 6):
    subset = tmp.loc[tmp["star"] == star,
                     ["Commentaire", "cleaned_words", "cleaned_lemma", "star"]]
    drawn = subset.sample(n=SAMPLE_NB)
    drawn["satisfaction"] = _satisfaction_label(star)
    per_star.append(drawn)
samples = pd.concat(per_star).reset_index(drop=True)
print("Samples:", samples.shape)
samples.head(5)
samples per star: 2000 Samples: (10000, 5)
| Commentaire | cleaned_words | cleaned_lemma | star | satisfaction | |
|---|---|---|---|---|---|
| 0 | livraison 24h facturé 8euro receptionné en 5 j... | livraison 24h facturé 8euro receptionné jour... | livraison 24h facturer 8euro receptionné jou... | 1 | negative |
| 1 | votre assistant sois disant là pour aider, il ... | assistant sois disant aider sert rien ... | assistant être dire aider servir rien ... | 1 | negative |
| 2 | ce sont de gros charlatent. Je commande, je pa... | gros charlatent commande paie reçois ri... | gros charlatent commande payer recevoir... | 1 | negative |
| 3 | Après avoir commandé un samsung galaxy A50 le ... | commandé samsung galaxy a50 2/10 devait l... | commander samsung galaxy a50 2/10 devoir ... | 1 | negative |
| 4 | Amazone: Un catalogue web conçu par une équipe... | amazone catalogue web conçu équipe incapa... | amazone catalogue web concevoir équipe in... | 1 | negative |
# Materialize the three text variants as plain lists for the encoders.
reviews = samples["Commentaire"].tolist()  # raw comment text
cleaned_reviews = samples["cleaned_words"].tolist()  # cleaned tokens
lemma_reviews = samples["cleaned_lemma"].tolist()  # lemmatized tokens
print("reviews:", len(reviews), end="\n\n")
print(reviews[10], end="\n\n")
print("lemmas:")
print(lemma_reviews[10])
reviews: 10000 j'ai commané un canapé a 419 €, et pris carte floabank pour obtenir une remise proposée de 105 €. la livraison n' a pas été effectuée, j'ai appelé et demander a changer d'adresse, au premier abord pas de probleme. puis Cdiscount a pretexté une rupture de stock alors que le canapé etait chez le transporteur. cdiscount a pretexté un defaut sur le canapé et a annulé la commande. ils m'ont propose une remboursement de 265 € et 49 € de bons d'achsts alors que j'avais payé 315€ d . apres multiples appels au service clients, ils me propose de me rembourser 315 € . donc au final : 1 heure au telephone, je n'ai pas le canapé et etb je paye une carte floabank 15 €/ an pour rien !!! lemmas: commaner canapé prendre carte floabank obtenir remise proposer livraison être effectuer appeler demander changer adresse probleme cdiscount pretexter rupture stock canapé transporteur cdiscount pretexter defaut canapé annuler commande proposer remboursement bon achsts payer multiple appel service client proposer rembourser final heure telephone canapé etb payer carte floabank an rien
# Encode the raw reviews with a French sentence-transformer
# (sentence-camembert-base), L2-normalized so cosine similarity == dot product.
MODEL_FOLDER = os.path.join(RAW_MODEL_FOLDER, "sentence-camembert-base")
sentence_model = SentenceTransformer(MODEL_FOLDER).to(device)
embeddings = sentence_model.encode(reviews, show_progress_bar=True, normalize_embeddings=True)
print("embeddings shape:", embeddings.shape)
Batches: 100%|██████████| 313/313 [00:13<00:00, 23.21it/s]
embeddings shape: (10000, 768)
# Cache the sentence embeddings to disk, then project them with t-SNE,
# colored by star rating.
OUT_EMBEDDINGS = os.path.join(DATA_FOLDER, "sentences_embeddings.npy")
np.save(OUT_EMBEDDINGS, embeddings)
stars = samples["star"].astype(str).tolist()
targets = samples["star"].tolist()
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SETS, markers=MARKERS, figsize=(900, 600), method="tsne")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
# Fit BERTopic on the lemmatized reviews, reusing the precomputed sentence
# embeddings (passed as the second fit_transform argument); c-TF-IDF with
# frequent-word reduction, up to 10 topics. Topic -1 is BERTopic's outlier bin.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model, n_gram_range=(1,3), min_topic_size=3, nr_topics=10)
# Cap BLAS/OpenMP threads during the UMAP/HDBSCAN fit.
with threadpool_limits(limits=_MAX_WORKERS):
    topics, probs = topic_model.fit_transform(lemma_reviews, embeddings)
samples["topic_sentences"] = topics
print("topic range:", min(topics), "to", max(topics))
topic range: -1 to 8
# Per-topic sizes and auto-generated names (-1 is the outlier topic).
topic_information = topic_model.get_topic_info()
topic_information
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 4261 | -1_commande_client_livraison_site |
| 1 | 0 | 3801 | 0_cdiscount_commande_livraison_client |
| 2 | 1 | 1780 | 1_rapide_satisfaire_choix_bon |
| 3 | 2 | 74 | 2_attendre_attendre livraison_attendre recevoi... |
| 4 | 3 | 56 | 3_amliorer_amlioration_site bon_manque choix |
| 5 | 4 | 13 | 4_commende_passer commende_virement_14032020 |
| 6 | 5 | 4 | 5_nickel cdiscount topefficacit_topefficacit r... |
| 7 | 6 | 4 | 6_toujour_20avril_livre depuit 20avril_termineje |
| 8 | 7 | 4 | 7_travailler bon fter_masquer bon comprehensio... |
| 9 | 8 | 3 | 8_vouloir informer site_informer site votrelit... |
# Topic term barcharts and intertopic distance map.
topic_model.visualize_barchart()
topic_model.visualize_topics()
# One word cloud per topic, built from the concatenated lemmatized reviews.
for topic in topic_information["Topic"].values:
    mask = (samples["topic_sentences"]==topic)
    text = " ".join(samples.loc[mask, "cleaned_lemma"].tolist())
    plot_word_cloud(text, "topic #"+str(topic), stop_words,color='white', max_words=200, figsize=(8, 4))
# t-SNE projection again, now colored by the assigned topic id.
stars = samples["topic_sentences"].astype(str).tolist()
targets = samples["topic_sentences"].tolist()
SET_TOPICS = [str(i) for i in range(samples["topic_sentences"].min(), samples["topic_sentences"].max()+1)]
MARKERS_TOPICS = ['square', 'circle', 'asterisk', 'triangle', 'diamond', "cross", "dash", "star", "plus", "hex"]
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SET_TOPICS, markers=MARKERS_TOPICS, figsize=(900, 600), method="tsne")
# Pick a random review and show its predicted topic with the top c-TF-IDF terms.
# Fixed off-by-one: `range(0, n-1)` could never select the last row; the full
# index range is range(n).
index = choice(range(samples.shape[0]))
text, star = samples[["Commentaire", "star"]].values[index]
print("star:", star, end="\n\n")
print(text, end="\n\n")
# Reuse the precomputed embedding for this row instead of re-encoding the text.
preds, probs = topic_model.transform([text], embeddings[index].reshape(1,-1))
top_topic = preds[0]
print("topic #", top_topic, end="\n\n")
pprint(topic_model.get_topic(top_topic))
star: 5
très bien je suis très heureux de vos services
topic # 1
[('rapide', 0.3777299644870111),
('satisfaire', 0.33875881475626346),
('choix', 0.3326373447455238),
('bon', 0.33142309441646484),
('satisfait', 0.32866808563836697),
('prix', 0.32733711284021005),
('recommander', 0.3117032592335597),
('livraison rapide', 0.3116419698824498),
('livraison', 0.31162375064786274),
('cdiscount', 0.3105385377537093)]
# Report GPU memory status before loading the next model (no-op on CPU).
if torch.cuda.is_available():
    print("__Used Logical Devices: {0}".format(device))
    print("__CUDNN VERSION:", torch.backends.cudnn.version())
    print("__Device Name:", torch.cuda.get_device_name(GPU_NUMBER))
    print("__Device Total Memory: {} GB".format(round(torch.cuda.get_device_properties(device).total_memory/1024**3, 1)))
    # Fixed: was `torch.torch.cuda.memory_reserved` (redundant self-alias).
    print("__Memory Usage: Allocated {0} GB, Cached {1} GB".format(
        round(torch.cuda.memory_allocated(device)/1024**3, 1),
        round(torch.cuda.memory_reserved(device)/1024**3, 1)))
__Used Logical Devices: cuda:15 __CUDNN VERSION: 8101 __Device Name: Tesla V100-SXM3-32GB __Device Total Memory: 31.7 GB __Memory Usage: Allocated 0.0 GB, Cached 0.5 GB
# Load a pretrained FlauBERT (base, uncased) from the local model store.
# The "unused weights" warning about `pred_layer.*` is expected: only the base
# transformer is needed here, not the pretraining head.
MODEL_PATH = os.path.join(RAW_MODEL_FOLDER, "flaubert-base-uncased")
tokenizer = FlaubertTokenizer.from_pretrained(MODEL_PATH)
flaubert_model = FlaubertModel.from_pretrained(MODEL_PATH).to(device)
Some weights of the model checkpoint at /data/DATALAB_PAU/20_projects/j0215602/DS_NLP/models/flaubert-base-uncased were not used when initializing FlaubertModel: ['pred_layer.proj.bias', 'pred_layer.proj.weight'] - This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# Mean-pooled FlauBERT embeddings for each raw review, batched on `device`.
embeddings = getTextsEmbeddings(reviews, tokenizer, flaubert_model, device, batch_size=32, mean=True)
print("embeddings shape:", embeddings.shape)
100%|██████████| 313/313 [00:30<00:00, 10.31it/s]
embeddings shape: (10000, 768)
# Cache the FlauBERT embeddings, then project with t-SNE colored by star rating.
OUT_EMBEDDINGS = os.path.join(DATA_FOLDER, "flaubert_pretrained_embeddings.npy")
np.save(OUT_EMBEDDINGS, embeddings)
stars = samples["star"].astype(str).tolist()
targets = samples["star"].tolist()
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SETS, markers=MARKERS, figsize=(900, 600), method="tsne")
# Fit a fresh BERTopic model on the same lemmatized reviews, this time with
# the FlauBERT embeddings, so topic quality can be compared across encoders.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model, n_gram_range=(1,3), min_topic_size=3, nr_topics=10)
with threadpool_limits(limits=_MAX_WORKERS):
    topics, probs = topic_model.fit_transform(lemma_reviews, embeddings)
samples["topic_flaubert"] = topics
print("topic range:", min(topics), "to", max(topics))
topic range: 0 to 3
# Per-topic sizes and names for the FlauBERT-based model.
topic_information = topic_model.get_topic_info()
topic_information
| Topic | Count | Name | |
|---|---|---|---|
| 0 | 0 | 13 | 0_bien passer_bien passer recommander_passer r... |
| 1 | 1 | 5986 | 1_commande_cdiscount_client_livraison |
| 2 | 2 | 1223 | 2_commande_site_livraison_cdiscount |
| 3 | 3 | 2778 | 3_livraison_bon_commande_prix |
# Barcharts, intertopic map, per-topic word clouds, and a topic-colored t-SNE
# projection for the FlauBERT-based topics.
topic_model.visualize_barchart()
topic_model.visualize_topics()
for topic in topic_information["Topic"].values:
    mask = (samples["topic_flaubert"]==topic)
    text = " ".join(samples.loc[mask, "cleaned_lemma"].tolist())
    plot_word_cloud(text, "topic #"+str(topic), stop_words,color='white', max_words=200, figsize=(8, 4))
stars = samples["topic_flaubert"].astype(str).tolist()
targets = samples["topic_flaubert"].tolist()
SET_TOPICS = [str(i) for i in range(samples["topic_flaubert"].min(), samples["topic_flaubert"].max()+1)]
MARKERS_TOPICS = ['square', 'circle', 'asterisk', 'triangle', 'diamond', "cross", "dash", "star", "plus", "hex"]
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SET_TOPICS, markers=MARKERS_TOPICS, figsize=(900, 600), method="tsne")
# Re-score the SAME review as before (index reused; re-sampling deliberately
# commented out) so topic assignments can be compared across embedding models.
#index = choice(range(0, samples.shape[0]-1))
print("index:", index)
text, star = samples[["Commentaire", "star"]].values[index]
print("star:", star, end="\n\n")
print(text, end="\n\n")
preds, probs = topic_model.transform([text], embeddings[index].reshape(1,-1))
top_topic = preds[0]
print("topic #", top_topic, end="\n\n")
pprint(topic_model.get_topic(top_topic))
index: 8878
star: 5
très bien je suis très heureux de vos services
topic # 3
[('livraison', 0.3764927652573101),
('bon', 0.34295199221636163),
('commande', 0.3272348052254968),
('prix', 0.32185154509488456),
('site', 0.3212018629665768),
('produit', 0.31822687921602427),
('rapide', 0.30008133835728124),
('cdiscount', 0.2929950107269028),
('bien', 0.2856447895602052),
('choix', 0.2758332502679072)]
# Report GPU memory status before loading the next model (no-op on CPU).
if torch.cuda.is_available():
    print("__Used Logical Devices: {0}".format(device))
    print("__CUDNN VERSION:", torch.backends.cudnn.version())
    print("__Device Name:", torch.cuda.get_device_name(GPU_NUMBER))
    print("__Device Total Memory: {} GB".format(round(torch.cuda.get_device_properties(device).total_memory/1024**3, 1)))
    # Fixed: was `torch.torch.cuda.memory_reserved` (redundant self-alias).
    print("__Memory Usage: Allocated {0} GB, Cached {1} GB".format(
        round(torch.cuda.memory_allocated(device)/1024**3, 1),
        round(torch.cuda.memory_reserved(device)/1024**3, 1)))
__Used Logical Devices: cuda:15 __CUDNN VERSION: 8101 __Device Name: Tesla V100-SXM3-32GB __Device Total Memory: 31.7 GB __Memory Usage: Allocated 0.5 GB, Cached 5.7 GB
# Load a pretrained CamemBERT (base) from the local model store.
# The "unused weights" warning about `lm_head.*` is expected: only the base
# transformer is needed, not the masked-LM head.
MODEL_PATH = os.path.join(RAW_MODEL_FOLDER, "camembert-base")
tokenizer = CamembertTokenizer.from_pretrained(MODEL_PATH)
camembert_model = CamembertModel.from_pretrained(MODEL_PATH).to(device)
Some weights of the model checkpoint at /data/DATALAB_PAU/20_projects/j0215602/DS_NLP/models/camembert-base were not used when initializing CamembertModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias'] - This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# Mean-pooled CamemBERT embeddings for each raw review, batched on `device`.
embeddings = getTextsEmbeddings(reviews, tokenizer, camembert_model, device, batch_size=32, mean=True)
print("embeddings shape:", embeddings.shape)
100%|██████████| 313/313 [00:23<00:00, 13.15it/s]
embeddings shape: (10000, 768)
# Cache the CamemBERT embeddings, then project with t-SNE colored by star rating.
OUT_EMBEDDINGS = os.path.join(DATA_FOLDER, "camembert_pretrained_embeddings.npy")
np.save(OUT_EMBEDDINGS, embeddings)
stars = samples["star"].astype(str).tolist()
targets = samples["star"].tolist()
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SETS, markers=MARKERS, figsize=(900, 600), method="tsne")
# Fit a third BERTopic model using the CamemBERT embeddings, completing the
# three-way encoder comparison (sentence-camembert / FlauBERT / CamemBERT).
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model, n_gram_range=(1,3), min_topic_size=3, nr_topics=10)
with threadpool_limits(limits=_MAX_WORKERS):
    topics, probs = topic_model.fit_transform(lemma_reviews, embeddings)
samples["topic_camembert"] = topics
print("topic range:", min(topics), "to", max(topics))
topic range: -1 to 8
# Per-topic sizes and names for the CamemBERT-based model.
topic_information = topic_model.get_topic_info()
topic_information
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 79 | -1_client annee_probleme_jour client_annee |
| 1 | 0 | 9832 | 0_commande_livraison_cdiscount_site |
| 2 | 1 | 41 | 1_probleme_client annee jamais_signalercommand... |
| 3 | 2 | 11 | 2_livrefacilite commande bon_produit non dispo... |
| 4 | 3 | 9 | 3_floabank_fois payer_payer fois_identifiant m... |
| 5 | 4 | 8 | 4_livraison rencontre_evaluer produit objectif... |
| 6 | 5 | 6 | 5_voir qualite livraison_chose faciliter paimm... |
| 7 | 6 | 5 | 6_015_malgre maint appel_proposer mise panier_... |
| 8 | 7 | 5 | 7_gratuite avedc carte_discount perimee renouv... |
| 9 | 8 | 4 | 8_fois refregerateur frigo_marche delai passer... |
# Barcharts, intertopic map, per-topic word clouds, and a topic-colored t-SNE
# projection for the CamemBERT-based topics.
topic_model.visualize_barchart()
topic_model.visualize_topics()
for topic in topic_information["Topic"].values:
    mask = (samples["topic_camembert"]==topic)
    text = " ".join(samples.loc[mask, "cleaned_lemma"].tolist())
    plot_word_cloud(text, "topic #"+str(topic), stop_words,color='white', max_words=200, figsize=(8, 4))
stars = samples["topic_camembert"].astype(str).tolist()
targets = samples["topic_camembert"].tolist()
SET_TOPICS = [str(i) for i in range(samples["topic_camembert"].min(), samples["topic_camembert"].max()+1)]
MARKERS_TOPICS = ['square', 'circle', 'asterisk', 'triangle', 'diamond', "cross", "dash", "star", "plus", "hex"]
plotVectors(embeddings, reviews, serie_values=stars, hue_value=targets, legend=SET_TOPICS, markers=MARKERS_TOPICS, figsize=(900, 600), method="tsne")
# Re-score the SAME review once more (index reused; re-sampling deliberately
# commented out) to compare its topic under the CamemBERT-based model.
#index = choice(range(0, samples.shape[0]-1))
print("index:", index)
text, star = samples[["Commentaire", "star"]].values[index]
print("star:", star, end="\n\n")
print(text, end="\n\n")
preds, probs = topic_model.transform([text], embeddings[index].reshape(1,-1))
top_topic = preds[0]
print("topic #", top_topic, end="\n\n")
pprint(topic_model.get_topic(top_topic))
index: 8878
star: 5
très bien je suis très heureux de vos services
topic # 0
[('commande', 0.22348958375858222),
('livraison', 0.21447456427490422),
('cdiscount', 0.2138595237640434),
('site', 0.20933914475769735),
('client', 0.20534506198756247),
('produit', 0.20374227699074454),
('service', 0.20305010278599375),
('bien', 0.19437912242317623),
('tre', 0.19325708177466863),
('bon', 0.1928583066375681)]